Objectives:

This study analyzes data from the Office for National Statistics (ONS), which includes median house prices and workplace-based earnings across England from 1997 to 2022. The analysis examines the housing market trends since 1997, with a focus on the most recent data of 2022. It addresses three key questions regarding:

  1. How has housing affordability evolved in England since 1997?
  2. How does housing affordability vary geographically in 2022?
  3. Which local authorities lead in affordability and unaffordability in 2022?

The R code in this document generates all graphs used in the report.

#### Set working directory and Load necessary libraries
setwd("C:/Users/PC - MSI/OneDrive - The University of Liverpool/An - 2025/2024 - UoL/Sem 1/ECON 705 Data Mgt/Exercise/Assignment1/Assignment1_Data")
library(readxl)
library(writexl)
library(tidyverse)
library(ggplot2)
library(sf)
library(tmap)

# =========================
# DATA PREPROCESSING
# =========================

# Import required Excel sheets
raw_house_price_region = read_excel("house_price_earnings.xlsx", sheet= "1a", skip=1, na = ":")
raw_earnings_region = read_excel("house_price_earnings.xlsx", sheet= "1b", skip=1, na = ":")
raw_house_price_district = read_excel("house_price_earnings.xlsx", sheet= "5a", skip=1, na = ":")
raw_earnings_district = read_excel("house_price_earnings.xlsx", sheet= "5b", skip=1, na = ":")
missing_local_data = read_excel("Local_Authorities_Missing_Data.xlsx")
housing_supply = read_excel("Data_Housing_Supply_England_2022.xlsx", skip=2)
uk_map <- st_read("Local_Authority_Districts_May_2024_Boundaries_UK_BFE/LAD_MAY_2024_UK_BFE.shp")

# Clean and pivot wide to long table
clean_long_format <- function(data, col_range, value_name) {
  data %>%
    pivot_longer(cols = all_of(col_range), names_to = "Year", values_to = value_name) %>%
    mutate(
      Year = as.numeric(gsub("[^0-9]", "", Year)),
      !!value_name := as.numeric(.data[[value_name]])
    )
}

# Tranform Data
house_price_region <- clean_long_format(raw_house_price_region, starts_with("Year"), "Median_House_Price")
earnings_region <- clean_long_format(raw_earnings_region, names(raw_earnings_region)[3:28], "Median_Earnings")
house_price_district <- clean_long_format(raw_house_price_district, starts_with("Year"), "Median_House_Price")
earnings_district <- clean_long_format(raw_earnings_district, names(raw_earnings_district)[5:30], "Median_Earnings")

# Impute missing earning data with median of the region in that year
earnings_district <- earnings_district %>%
  group_by(`Region code`, Year) %>%
  mutate(Median_Earnings = ifelse(is.na(Median_Earnings), median(Median_Earnings, na.rm = TRUE), Median_Earnings)) %>%
  ungroup()

# Merge and clean region data
region_data <- left_join(house_price_region, earnings_region, by = c("Code", "Year", "Name")) %>%
  mutate(
    Affordability_Ratio = Median_House_Price / Median_Earnings,
    Region_Code = Code,
    Region_Name = Name
  ) %>%
  filter(!Region_Name %in% c("England and Wales", "Wales"))

# Merge and clean district data
district_data <- left_join(house_price_district, earnings_district, 
                           by = c("Local authority code" = "Code", "Local authority name" = "Name", "Year")) %>%
  select(-`Region code.y`, -`Region name.y`) %>%
  mutate(
    Affordability_Ratio = Median_House_Price / Median_Earnings,
    Region_Code = `Region code.x`,
    Region_Name = `Region name.x`,
    Local_Authority_Code = `Local authority code`,
    Local_Authority_Name = `Local authority name`
  ) %>%
  filter(!Region_Name %in% c("England and Wales", "Wales")) %>%
  bind_rows(missing_local_data)

# =========================
# DATA ANALYSIS
# =========================

# ===================================================
# FIGURE 1. Affordability ratio over time for England
region_data %>%
  filter(Region_Name == 'England') %>%
  ggplot( aes(x = Year, y = Affordability_Ratio)) +
  # Highlight periods
  geom_rect(aes(xmin = 1997, xmax = 2008, ymin = -Inf, ymax = Inf), 
            fill = "#f0f0f0", alpha = 0.3, data = NULL) +
  geom_rect(aes(xmin = 2008, xmax = 2013, ymin = -Inf, ymax = Inf), 
            fill = "#e0ffff", alpha = 0.2, data = NULL) +
  geom_rect(aes(xmin = 2013, xmax = 2022, ymin = -Inf, ymax = Inf), 
            fill = "#fffaf0", alpha = 0.3, data = NULL) +  
  geom_line(color = "darkblue", linewidth = 1) +  # Affordability Ratio line
  # Add threshold line
  geom_hline(yintercept = 5, color = "darkblue", linetype = "dashed", linewidth = 0.8) +  # Set x-axis to show each year
  scale_x_continuous(breaks = seq(1998, 2022, by = 2)) +
  scale_y_continuous(limits = c(0, 10)) +
  geom_point(data = subset(region_data, Region_Name == 'England' & Year %in% c(2002,2009, 2021)),
             aes(x = Year, y = Affordability_Ratio), color = "red", size = 2) +
  labs(title = "Affordability Ratio Over Time in England") +
  theme_minimal() +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
  )

# =====================================================
# FIGURE 2. House prices, earnings over time for England
region_data[region_data$Region_Name == 'England',] %>%
  ggplot() +
  geom_line(aes(x = Year, y = Median_House_Price / 1000, color = "House Price (\u00a3000s)"), linewidth = 1) +
  geom_line(aes(x = Year, y = 5 * Median_Earnings / 1000, color = "5-Year Earnings (\u00a3000s)"), linewidth = 1) +
  geom_point(data = subset(region_data, Region_Name == 'England' & Year %in% c(2008, 2021)), 
             aes(x = Year, y = Median_House_Price / 1000), color = "darkblue", size = 3) +
  geom_point(data = subset(region_data, Region_Name == 'England' & Year %in% c(2008, 2021)), 
             aes(x = Year, y = 5 * Median_Earnings / 1000), color = "#56B4E9", size = 3) +
  geom_text(data = subset(region_data, Region_Name == 'England' & Year %in% c(2008, 2021)),
            aes(x = Year, y = Median_House_Price / 1000, label = round(Median_House_Price / 1000, 1)),
            vjust = -1, color = "darkblue", size = 3) +
  geom_text(data = subset(region_data, Region_Name == 'England' & Year %in% c(2008, 2021)),
            aes(x = Year, y = 5 * Median_Earnings / 1000, label = round(5 * Median_Earnings / 1000, 1)),
            vjust = 1.5, color = "#56B4E9", size = 3) +
  scale_color_manual(values = c("House Price (\u00a3000s)" = "darkblue", 
                                "5-Year Earnings (\u00a3000s)" = "#56B4E9")) +
  scale_x_continuous(breaks = seq(1998, 2022, by = 2)) +
  scale_y_continuous(limits = c(0, 300)) +
  labs(title = "House Prices, Earnings in England",
       x = "Year",
       y = "Amount (\u00a3000s)") +
  theme_minimal() +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    legend.position = "top"
  )

## Regional Analysis for 2022
## Filter data 2022
district_data_2022 <- filter(district_data,Year == 2022)%>%
  group_by(Region_Name) %>%
  mutate(
    Median_Value = median(Affordability_Ratio),
  )

# =================================================================
## FIGURE 3. Range of Housing Affordability Ratio by Region - 2022
district_summary_data_2022 <- district_data_2022 %>%
  group_by(Region_Name) %>%
  summarize(
    min_value = min(Affordability_Ratio, na.rm = TRUE),
    max_value = max(Affordability_Ratio, na.rm = TRUE),
    median_value = median(Affordability_Ratio, na.rm = TRUE),
    min_district = Local_Authority_Name[which.min(Affordability_Ratio)],
    max_district = Local_Authority_Name[which.max(Affordability_Ratio)],
  )
# Plot
ggplot(district_summary_data_2022, aes(y = reorder(Region_Name, median_value))) +
  geom_linerange(aes(xmin = min_value, xmax = max_value), color = "darkblue", linewidth = 7) +
  geom_point(aes(x = median_value), color = "skyblue", size = 3) +
  geom_text(aes(x = min_value, label = min_district), hjust = 1.1, color = "darkblue", size = 4) +
  geom_text(aes(x = max_value, label = max_district), hjust = -0.1, color = "darkblue", size = 4) +
  labs(
    title = "Range of Housing Affordability Ratio by Region (Min to Max) - 2022",
    x = "Affordability Ratio",
    y = "Region"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(), 
    axis.text.y = element_text(size = 12),
  ) +
  xlim(-5,48) 

# =========================
# TABLE 1. Top 10 affordable
top_10_affordable <- district_data_2022 %>%
  arrange(Affordability_Ratio) %>%
  select(Local_Authority_Name, Region_Name, Affordability_Ratio) %>%
  head(10)
# TABLE 2. Top 10 unaffordable
top_10_unaffordable <- district_data_2022 %>%
  arrange(desc(Affordability_Ratio)) %>%
  select(Local_Authority_Name, Region_Name, Affordability_Ratio) %>%
  head(10)

# =============================================
## FIGURE 4. Net additional dwellings over time
ggplot(housing_supply, aes(x = Year, y = Net_Additional_Dwellings/1000)) +
  geom_bar(stat = "identity", fill = "darkblue") +
  # Add threshold line at 300,000
  geom_hline(yintercept = 300, color = "red", linetype = "dashed", linewidth = 0.8) +
  scale_x_continuous(breaks = seq(1998, 2022, by = 2)) +
  # Labels and theme
  labs(
    title = "Net Additional Dwellings in England by Year",
    x = "Year",
    y = "(000s)"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

# ============
### APPENDIX
# ============
## Appendix A. Divergence between House prices and 5-year earnings by region
region_data %>% filter(!Region_Name == 'England') %>%
  ggplot() +
  geom_line(aes(x = Year, y = Median_House_Price / 1000, color = "House Price (\u00a3000s)"), linewidth = 1) +
  geom_line(aes(x = Year, y = 5 * Median_Earnings / 1000, color = "5-year Earnings (\u00a3000s)"), linewidth = 1) +
  scale_color_manual(values = c("House Price (\u00a3000s)" = "darkblue", 
                                "5-year Earnings (\u00a3000s)" = "#56B4E9")) +
  # Facet by region name to create a separate plot for each region
  facet_wrap(~ Region_Name, ncol=3)+
  labs(title = "House prices and 5-year earnings by region",
       x = "Year",
       y = "Amount (\u00a3000s)") +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 14, face = "bold"),
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "top",
  )

## Appendix B. Affordability Ratio by Local Authorities

# Merge spatial data with affordability data by local authority code
map_data_2022 <- uk_map %>%
  left_join(district_data_2022, by = c("LAD24CD" = "Local_Authority_Code")) %>%
  filter(!is.na(Affordability_Ratio) & Year == 2022)
# Spatial map
tm_shape(map_data_2022) +
  tm_fill(
    col = "Affordability_Ratio",
    palette = "YlGnBu",
    title = "Affordability Ratio",
    style = "fixed",
    breaks = c(0, 5, 8, 12, 24, 44),
    popup.vars = c(
      "Region" = "Region_Name",
      "Affordability Ratio" = "Affordability_Ratio"
    )
  ) +
  tm_borders(col = "#2F4F4F") +
  tm_layout(
    title = "Affordability Ratio by Region (2022)",
    legend.outside = TRUE,
    legend.outside.size = 0.3
  )